# Load the raw database, dropping columns 2-9 and 11-13 by position.
data <- read.csv("../data/filledDatabase.csv")[, -c(2:9, 11:13)]

# Clean, then collapse, the data (both helpers are defined elsewhere).
data <- clean_data(data) %>% collapse_data()

# Keep copies of the compound, group category and space group columns
# before some of them are dropped from `data` below.
compound <- data$Compound
group_cat <- data$GroupCat
space_group <- data$SpaceGroup

# Drop the Compound, X, Z, SpaceGroup and SpaceGroupNumber columns.
# GroupCat stays in `data`; it is used as the response further down.
data <- select(
  data,
  -c("Compound", "X", "Z", "SpaceGroup", "SpaceGroupNumber")
)
# Projection onto the first 13 principal components is currently disabled:
# data_pca <- get_pc_space(data[,-1], k = 13) %>% scale() %>% data.frame()

# Split the row indices into 5 held-out folds for cross validation later.
# seq_len() is used instead of 1:nrow(data) so an empty data frame yields an
# empty index vector rather than c(1, 0).
# NOTE(review): the folds are built from row indices, not from GroupCat, so
# they are not stratified by class, and no seed is set before this call --
# confirm whether stratification / reproducibility is wanted.
folds <- caret::createFolds(
  seq_len(nrow(data)),
  k = 5,
  list = TRUE,
  returnTrain = FALSE
)

Multinomial Regression

library(glmnet)
# Predictor matrix: every column of `data` except the first
# (assumes the first column is GroupCat -- TODO confirm).
X <- as.matrix(data[, -1])
# Response: the GroupCat column converted to a matrix.
Y <- as.matrix(data$GroupCat)

Shrinkage

Ridge

# Fit the multinomial glmnet path with alpha = 0 (ridge penalty) and plot the
# labelled coefficient paths with xvar = "lambda".
model_ridge <- glmnet(X, Y, family = "multinomial", alpha = 0)
plot(model_ridge, xvar = "lambda", label = TRUE)

LASSO

# Fit the multinomial glmnet path with alpha = 1 (lasso penalty) and plot the
# labelled coefficient paths with xvar = "lambda".
model_lasso <- glmnet(X, Y, family = "multinomial", alpha = 1)
plot(model_lasso, xvar = "lambda", label = TRUE)

Coefficients

Ridge

# 5-fold cross-validation of the ridge (alpha = 0) multinomial model,
# scored by deviance.
ridge_cv <- cv.glmnet(
  X, Y,
  family = "multinomial",
  alpha = 0,
  nfolds = 5,
  type.measure = "deviance"
)
# Coefficients at lambda.min: keep the feature column and the three class
# columns, drop the intercept row, then plot (get_coef / plot_coef are
# helpers defined elsewhere).
get_coef(ridge_cv, tuning_parameter = ridge_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

LASSO

# 5-fold cross-validation of the lasso (alpha = 1) multinomial model,
# scored by deviance.
lasso_cv <- cv.glmnet(
  X, Y,
  family = "multinomial",
  alpha = 1,
  nfolds = 5,
  type.measure = "deviance"
)
# Coefficients at lambda.min: keep the feature column and the three class
# columns, drop the intercept row, then plot (get_coef / plot_coef are
# helpers defined elsewhere).
get_coef(lasso_cv, tuning_parameter = lasso_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

Elastic Net

library(caret)
# Tune a glmnet model of GroupCat on all other columns with caret, using
# 5-fold cross-validation and tuneLength = 10.
elastic_cv <- train(
  GroupCat ~ .,
  data = data,
  method = "glmnet",
  tuneLength = 10,
  trControl = trainControl(method = "cv", number = 5)
)
# Coefficients of the final model at the selected lambda: keep the feature
# column and the three class columns, drop the intercept row, then plot.
get_coef(
  elastic_cv$finalModel,
  tuning_parameter = elastic_cv$bestTune$lambda
) %>%
  select(feature, Cubic, Tilted, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

Classification accuracy

Ridge

# Build the prediction summary for ridge (alpha = 0) at the CV-selected lambda.
# prediction_table() is defined elsewhere; `$r` appears to hold the per-fold
# accuracy shown in the printed output -- confirm against the helper.
tb_ridge <- prediction_table(alpha = 0, lambda = ridge_cv$lambda.min)
print_accurate_tb(tb_ridge$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.8533333 0.9054054 0.8513514 0.8108108 0.7567568 0.8355315
# Display the ridge table stored in `$t` as counts.
highlight_tb_count(tb_ridge$t)
Cubic Others Tilted
Cubic 158 5 28
Others 0 28 3
Tilted 21 4 124
Total 179 37 155
# Display the ridge table stored in `$t` as percentages.
highlight_tb_percent(tb_ridge$t)
Cubic Others Tilted
Cubic 0.88 0.14 0.18
Others 0 0.76 0.02
Tilted 0.12 0.11 0.8
Total 100% 100% 100%

LASSO

# Build the prediction summary for lasso (alpha = 1) at the CV-selected lambda.
# prediction_table() is defined elsewhere; `$r` appears to hold the per-fold
# accuracy shown in the printed output -- confirm against the helper.
tb_lasso <- prediction_table(alpha = 1, lambda = lasso_cv$lambda.min)
print_accurate_tb(tb_lasso$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.8666667 0.8918919 0.8648649 0.8513514 0.7972973 0.8544144
# Display the lasso table stored in `$t` as counts.
highlight_tb_count(tb_lasso$t)
Cubic Others Tilted
Cubic 159 5 24
Others 0 28 1
Tilted 20 4 130
Total 179 37 155
# Display the lasso table stored in `$t` as percentages.
highlight_tb_percent(tb_lasso$t)
Cubic Others Tilted
Cubic 0.89 0.14 0.15
Others 0 0.76 0.01
Tilted 0.11 0.11 0.84
Total 100% 100% 100%

Elastic Net

# Build the prediction summary for the elastic net at the caret-selected
# tuning parameters. The columns of bestTune are read by name rather than by
# position, so the call does not depend on column order and matches the
# `bestTune$lambda` access used when plotting the coefficients.
# prediction_table() is defined elsewhere; `$r` appears to hold the per-fold
# accuracy shown in the printed output -- confirm against the helper.
tb_elastic <- prediction_table(
  alpha = elastic_cv$bestTune$alpha,
  lambda = elastic_cv$bestTune$lambda
)
print_accurate_tb(tb_elastic$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.8933333 0.8378378 0.8378378 0.8243243 0.8243243 0.8435315
# Display the elastic-net table stored in `$t` as counts.
highlight_tb_count(tb_elastic$t)
Cubic Others Tilted
Cubic 155 3 24
Others 4 29 2
Tilted 20 5 129
Total 179 37 155
# Display the elastic-net table stored in `$t` as percentages.
highlight_tb_percent(tb_elastic$t)
Cubic Others Tilted
Cubic 0.87 0.08 0.15
Others 0.02 0.78 0.01
Tilted 0.11 0.14 0.83
Total 100% 100% 100%